# Importing the necessary library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
# List the files available in the downloaded Uber datasets folder
os.listdir(r"C:\Users\ElMehdi\Downloads\Uber\Datasets")
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Load the 100k-row sample of the Jan-June 2015 raw pickup data
uber = pd.read_csv(r"C:\Users\ElMehdi\Downloads\Uber\Datasets/uber-raw-data-janjune-15_sample.csv")
# (rows, columns)
uber.shape
(100000, 4)
# Count fully duplicated rows (all four columns equal)
uber.duplicated().sum()
54
# Drop the duplicate rows in place, then re-count to confirm none remain
uber.drop_duplicates(inplace=True)
uber.duplicated().sum()
0
# Shape after deduplication (54 rows removed)
uber.shape
(99946, 4)
# Checking the data types of each column
uber.dtypes
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
# Checking for null values per column (only Affiliated_base_num has any)
uber.isnull().sum()
Dispatching_base_num 0 Pickup_date 0 Affiliated_base_num 1116 locationID 0 dtype: int64
# Inspect the first pickup timestamp (currently stored as a plain string)
uber['Pickup_date'][0]
'2015-05-02 21:43:00'
# Confirm the raw type is str, not datetime
type(uber['Pickup_date'][0])
str
# Parse the pickup timestamps into datetime64[ns] so the .dt accessors work
uber['Pickup_date'] = pd.to_datetime(uber['Pickup_date'])
# Verify the conversion
uber.dtypes
Dispatching_base_num object Pickup_date datetime64[ns] Affiliated_base_num object locationID int64 dtype: object
# Split the pickup timestamp into calendar features for grouping/plotting.
pickup = uber['Pickup_date'].dt
uber['month'] = pickup.month_name()
uber['weekday'] = pickup.day_name()
uber['day'] = pickup.day
uber['hour'] = pickup.hour
uber['minute'] = pickup.minute
uber.head(5)
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | weekday | day | hour | minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 | May | Saturday | 2 | 21 | 43 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 | January | Tuesday | 20 | 19 | 52 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 | March | Thursday | 19 | 20 | 26 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 | April | Friday | 10 | 17 | 38 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 | March | Monday | 23 | 7 | 3 |
# Trip counts by month (rows) and weekday (columns).
pivot = pd.crosstab(index = uber['month'], columns = uber['weekday'])
# crosstab sorts labels alphabetically (April, February, January, ...),
# which scrambles the bar chart below — reorder chronologically instead.
months = ['January', 'February', 'March', 'April', 'May', 'June',
          'July', 'August', 'September', 'October', 'November', 'December']
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot = pivot.reindex(index=[m for m in months if m in pivot.index],
                      columns=[d for d in days if d in pivot.columns])
pivot
| weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| month | |||||||
| April | 2365 | 1833 | 2508 | 2052 | 2823 | 1880 | 2521 |
| February | 2655 | 1970 | 2550 | 2183 | 2396 | 2129 | 2013 |
| January | 2508 | 1353 | 2745 | 1651 | 2378 | 1444 | 1740 |
| June | 2793 | 2848 | 3037 | 2485 | 2767 | 3187 | 2503 |
| March | 2465 | 2115 | 2522 | 2379 | 2093 | 2388 | 2007 |
| May | 3262 | 1865 | 3519 | 2944 | 2627 | 2115 | 2328 |
# Grouped bar chart: pickups per month, one bar per weekday
pivot.plot(kind='bar', figsize=(13,11))
<AxesSubplot:xlabel='month'>
# Number of pickups for every (weekday, hour) combination — 7 x 24 = 168 rows.
summary = uber.groupby(['weekday', 'hour']).size().reset_index(name='size')
summary
| weekday | hour | size | |
|---|---|---|---|
| 0 | Friday | 0 | 581 |
| 1 | Friday | 1 | 333 |
| 2 | Friday | 2 | 197 |
| 3 | Friday | 3 | 138 |
| 4 | Friday | 4 | 161 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 1044 |
| 164 | Wednesday | 20 | 897 |
| 165 | Wednesday | 21 | 949 |
| 166 | Wednesday | 22 | 900 |
| 167 | Wednesday | 23 | 669 |
168 rows × 3 columns
# Hourly demand curve, one line per weekday
plt.figure(figsize=(12,10))
sns.pointplot(x="hour", y="size", hue="weekday", data=summary)
<AxesSubplot:xlabel='hour', ylabel='size'>
# Load the Jan-Feb 2015 FOIL summary (per-base daily active vehicles and trips)
uber_foil = pd.read_csv(r"C:\Users\ElMehdi\Downloads\Uber\Datasets/Uber-Jan-Feb-FOIL.csv")
# (rows, columns)
uber_foil.shape
(354, 4)
# Preview the FOIL data
uber_foil.head(5)
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
# Distribution of daily active vehicles for each dispatching base.
plt.figure(figsize=(12, 8))
ax = sns.boxplot(x='dispatching_base_number', y='active_vehicles', data=uber_foil)
ax.set_title('Active Vehicles by Dispatching Base')
ax.set_ylabel('Active Vehicles')
plt.show()
# List the dataset files again to pick out the 2014 monthly files
os.listdir(r"C:\Users\ElMehdi\Downloads\Uber\Datasets")
['other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Keep only the six 2014 monthly raw-data files: take the last eight entries
# of the listing and filter out the two 2015 extracts.
excluded = {'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-janjune-15.csv'}
files = [name
         for name in os.listdir(r"C:\Users\ElMehdi\Downloads\Uber\Datasets")[-8:]
         if name not in excluded]
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
# Combine the six 2014 monthly files into one DataFrame.
# Read all frames first and concatenate once: concatenating inside the loop
# copies the accumulated data on every iteration (quadratic in total rows).
path = r"C:\Users\ElMehdi\Downloads\Uber\Datasets"
monthly_frames = [pd.read_csv(path + '/' + file) for file in files]
# The original loop prepended each new frame, so reverse to keep that row order.
final = pd.concat(monthly_frames[::-1]) if monthly_frames else pd.DataFrame()
# (rows, columns) of the combined 2014 data
final.shape
(4534327, 4)
# Checking for duplicate rows in the combined data
final.duplicated().sum()
82581
# Drop duplicates in place (original per-file row indices are kept)
final.drop_duplicates(inplace=True)
final.head(5)
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
| 2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
| 3 | 9/1/2014 0:06:00 | 40.7450 | -73.9889 | B02512 |
| 4 | 9/1/2014 0:11:00 | 40.8145 | -73.9444 | B02512 |
# Group the data by pickup coordinates; 'size' = number of pickups at each (Lat, Lon)
rush_uber =final.groupby(['Lat', 'Lon'], as_index=False).size()
rush_uber.head(5)
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
| 3 | 39.8416 | -74.1512 | 1 |
| 4 | 39.9055 | -74.0791 | 1 |
!pip install folium
Requirement already satisfied: folium in c:\users\elmehdi\anaconda3\lib\site-packages (0.14.0) Requirement already satisfied: numpy in c:\users\elmehdi\anaconda3\lib\site-packages (from folium) (1.20.1) Requirement already satisfied: requests in c:\users\elmehdi\anaconda3\lib\site-packages (from folium) (2.28.1) Requirement already satisfied: branca>=0.6.0 in c:\users\elmehdi\anaconda3\lib\site-packages (from folium) (0.6.0) Requirement already satisfied: jinja2>=2.9 in c:\users\elmehdi\anaconda3\lib\site-packages (from folium) (2.11.3) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\elmehdi\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (1.1.1) Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\elmehdi\anaconda3\lib\site-packages (from requests->folium) (2.1.1) Requirement already satisfied: certifi>=2017.4.17 in c:\users\elmehdi\anaconda3\lib\site-packages (from requests->folium) (2020.12.5) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\elmehdi\anaconda3\lib\site-packages (from requests->folium) (1.26.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\elmehdi\anaconda3\lib\site-packages (from requests->folium) (2.10)
# Create a heatmap of pickup hot spots with the folium library.
import folium
from folium.plugins import HeatMap

basemap = folium.Map()
# Pass an explicit list of [lat, lon, weight] triples instead of the raw
# DataFrame, so the result does not depend on folium's DataFrame handling
# or on the frame's column order.
HeatMap(rush_uber[['Lat', 'Lon', 'size']].values.tolist()).add_to(basemap)
<folium.plugins.heat_map.HeatMap at 0x182bee34250>
# Render the heatmap (displays inline in a notebook)
basemap